Personal Computer World 2009 February

home *** CD-ROM | disk | FTP | other *** search

/ Personal Computer World 2009 February / PCWFEB09.iso / Software / Resources / Chat & Communication / Digsby build 37 / digsby_setup.exe / lib / HTMLParser.pyo (.txt) < prev next >

Wrap

Python Compiled Bytecode | 2008-10-13 | 10KB | 296 lines

# Source Generated with Decompyle++ # File: in.pyo (Python 2.5) import markupbase import re interesting_normal = re.compile('[&<]') interesting_cdata = re.compile('<(/|\\Z)') incomplete = re.compile('&[a-zA-Z#]') entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') starttagopen = re.compile('<[a-zA-Z]') piclose = re.compile('>') commentclose = re.compile('--\\s*>') tagfind = re.compile('[a-zA-Z][-.a-zA-Z0-9:_]*') attrfind = re.compile('\\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\\s*=\\s*(\\\'[^\\\']*\\\'|"[^"]*"|[-a-zA-Z0-9./,:;+*%?!&$\$\$_#=~@]*))?') locatestarttagend = re.compile('\n <[a-zA-Z][-.a-zA-Z0-9:_]* # tag name\n (?:\\s+ # whitespace before attribute name\n (?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name\n (?:\\s*=\\s* # value indicator\n (?:\'[^\']*\' # LITA-enclosed value\n |\\"[^\\"]*\\" # LIT-enclosed value\n |[^\'\\">\\s]+ # bare value\n )\n )?\n )\n )*\n \\s* # trailing whitespace\n', re.VERBOSE) endendtag = re.compile('>') endtagfind = re.compile('</\\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\\s*>') class HTMLParseError(Exception): def __init__(self, msg, position = (None, None)): self.msg = msg self.lineno = position[0] self.offset = position[1] def __str__(self): result = self.msg if self.lineno is not None: result = result + ', at line %d' % self.lineno if self.offset is not None: result = result + ', column %d' % (self.offset + 1) return result class HTMLParser(markupbase.ParserBase): CDATA_CONTENT_ELEMENTS = ('script', 'style') def __init__(self): self.reset() def reset(self): self.rawdata = '' self.lasttag = '???' self.interesting = interesting_normal markupbase.ParserBase.reset(self) def feed(self, data): self.rawdata = self.rawdata + data self.goahead(0) def close(self): self.goahead(1) def error(self, message): raise HTMLParseError(message, self.getpos()) __starttag_text = None def get_starttag_text(self): return self._HTMLParser__starttag_text def set_cdata_mode(self): self.interesting = interesting_cdata def clear_cdata_mode(self): self.interesting = interesting_normal def goahead(self, end): rawdata = self.rawdata i = 0 n = len(rawdata) while i < n: match = self.interesting.search(rawdata, i) if match: j = match.start() else: j = n if i < j: self.handle_data(rawdata[i:j]) i = self.updatepos(i, j) if i == n: break startswith = rawdata.startswith None if startswith('<', i) else match if startswith('&', i): match = entityref.match(rawdata, i) if match: name = match.group(1) self.handle_entityref(name) k = match.end() if not startswith(';', k - 1): k = k - 1 i = self.updatepos(i, k) continue match = incomplete.match(rawdata, i) if match: if end and match.group() == rawdata[i:]: self.error('EOF in middle of entity or char ref') break elif i + 1 < n: self.handle_data('&') i = self.updatepos(i, i + 1) else: break match if end and i < n: self.handle_data(rawdata[i:n]) i = self.updatepos(i, n) self.rawdata = rawdata[i:] def parse_pi(self, i): rawdata = self.rawdata match = piclose.search(rawdata, i + 2) if not match: return -1 j = match.start() self.handle_pi(rawdata[i + 2:j]) j = match.end() return j def parse_starttag(self, i): self._HTMLParser__starttag_text = None endpos = self.check_for_whole_start_tag(i) if endpos < 0: return endpos rawdata = self.rawdata self._HTMLParser__starttag_text = rawdata[i:endpos] attrs = [] match = tagfind.match(rawdata, i + 1) k = match.end() self.lasttag = tag = rawdata[i + 1:k].lower() while k < endpos: m = attrfind.match(rawdata, k) if not m: break (attrname, rest, attrvalue) = m.group(1, 2, 3) if not rest: attrvalue = None elif "'" == "'": pass elif not "'" == attrvalue[-1:]: if '"' == '"': pass elif '"' == attrvalue[-1:]: attrvalue = attrvalue[1:-1] attrvalue = self.unescape(attrvalue) attrs.append((attrname.lower(), attrvalue)) k = m.end() continue attrvalue[:1] end = rawdata[k:endpos].strip() if end not in ('>', '/>'): (lineno, offset) = self.getpos() if '\n' in self._HTMLParser__starttag_text: lineno = lineno + self._HTMLParser__starttag_text.count('\n') offset = len(self._HTMLParser__starttag_text) - self._HTMLParser__starttag_text.rfind('\n') else: offset = offset + len(self._HTMLParser__starttag_text) self.error('junk characters in start tag: %r' % (rawdata[k:endpos][:20],)) if end.endswith('/>'): self.handle_startendtag(tag, attrs) else: self.handle_starttag(tag, attrs) if tag in self.CDATA_CONTENT_ELEMENTS: self.set_cdata_mode() return endpos def check_for_whole_start_tag(self, i): rawdata = self.rawdata m = locatestarttagend.match(rawdata, i) if m: j = m.end() next = rawdata[j:j + 1] if next == '>': return j + 1 if next == '/': if rawdata.startswith('/>', j): return j + 2 if rawdata.startswith('/', j): return -1 self.updatepos(i, j + 1) self.error('malformed empty start tag') if next == '': return -1 if next in 'abcdefghijklmnopqrstuvwxyz=/ABCDEFGHIJKLMNOPQRSTUVWXYZ': return -1 self.updatepos(i, j) self.error('malformed start tag') raise AssertionError('we should not get here!') def parse_endtag(self, i): rawdata = self.rawdata match = endendtag.search(rawdata, i + 1) if not match: return -1 j = match.end() match = endtagfind.match(rawdata, i) if not match: self.error('bad end tag: %r' % (rawdata[i:j],)) tag = match.group(1) self.handle_endtag(tag.lower()) self.clear_cdata_mode() return j def handle_startendtag(self, tag, attrs): self.handle_starttag(tag, attrs) self.handle_endtag(tag) def handle_starttag(self, tag, attrs): pass def handle_endtag(self, tag): pass def handle_charref(self, name): pass def handle_entityref(self, name): pass def handle_data(self, data): pass def handle_comment(self, data): pass def handle_decl(self, decl): pass def handle_pi(self, data): pass def unknown_decl(self, data): self.error('unknown declaration: %r' % (data,)) def unescape(self, s): if '&' not in s: return s s = s.replace('<', '<') s = s.replace('>', '>') s = s.replace(''', "'") s = s.replace('"', '"') s = s.replace('&', '&') return s